Introduction

To investigate ride behavior differences between casual and member users and uncover temporal and spatial patterns in ride activity, a comprehensive and well-structured database is essential. The analysis focuses on understanding how ride patterns vary across time—daily, weekly, and seasonally—and space—stations and routes—while identifying trends in ride duration, station popularity, and overall demand. These insights are critical for guiding Divvy’s operational decisions and marketing strategies.

The source data for this project consists of 12 monthly Divvy trip datasets for the year 2024, containing ride-level information such as ride identifiers, timestamps, start and end stations, and user type (casual vs. member). To efficiently support analysis, a relational database will be designed to:

  • Consolidate the monthly datasets into a single, queryable structure.
  • Maintain data integrity with primary keys and appropriate data types for timestamps, text fields, and identifiers.
  • Enable temporal analysis by storing ride start and end times in a standardized timestamp format.
  • Support spatial analysis by including station names and IDs, allowing examination of station popularity and route patterns.
  • Facilitate user segmentation by distinguishing between casual and member riders.

By implementing this database, analysts will be able to efficiently query and aggregate data, uncover patterns in ride behavior, and generate actionable insights for Divvy’s operational planning and marketing initiatives.

Divvy Database Creation

Database connection

Establish postgresql database connection

# Read config
config <- read.ini("resources/db_config.ini")
db <- config$postgresql

# Safe database connection
tryCatch({
  con <- dbConnect(
    Postgres(),
    host = db$host,
    dbname = db$database,
    user = db$user,
    password = db$password,
    port = as.integer(db$port)
  )
}, error = function(e) {
  stop("Database connection failed: ", e$message)
})

# Register connection for SQL chunks
knitr::opts_chunk$set(connection = con)

Database schema

Create database schema to the database

CREATE SCHEMA IF NOT EXISTS divvy;

Database tables

Read data into R environment

# Define year and months
year <- "2024"
months <- sprintf("%02d", 1:12)

# Initialize list to store data
divvy_data <- list()

# Loop through months and read each file
for (m in months) {
  file_path <- paste0("resources/data/", year, m, "-divvy-tripdata.csv")
  month_name <- tolower(format(as.Date(paste0(year, "-", m, "-01")), "%B"))
  
  divvy_data[[month_name]] <- read_csv(file_path, show_col_types = FALSE)
}

Drop tables if already exists

# Define all month names
months <- tolower(month.name)

for (m in months) {
  sql <- glue::glue("DROP TABLE IF EXISTS divvy.{m};")
  DBI::dbExecute(con, sql)
}

January table

Create and load data for the month of January

# Create table
create_jan_table <- "
  CREATE TABLE divvy.january (
      ride_id             TEXT PRIMARY KEY,
      rideable_type       TEXT,
      started_at          TIMESTAMP,
      ended_at            TIMESTAMP,
      start_station_name  TEXT,
      start_station_id    TEXT,
      end_station_name    TEXT,
      end_station_id      TEXT,
      member_casual       TEXT
  );
"
DBI::dbExecute(con, create_jan_table)
## [1] 0
# Write to PostgreSQL (replaces table if exists)
dbWriteTable(
  conn      = con,
  name      = DBI::Id(schema = "divvy", table = "january"),
  value     = divvy_data$january,
  overwrite = TRUE,   # drop & recreate table
  row.names = FALSE
)

February table

Create and load data for the month of February

# Months we want to load
months <- names(divvy_data)   # "january", "february", ... "december"

for (month_name in months) {
  
  # SQL to create the table
  create_sql <- glue("
    CREATE TABLE divvy.{month_name} (
        ride_id             TEXT PRIMARY KEY,
        rideable_type       TEXT,
        started_at          TIMESTAMP,
        ended_at            TIMESTAMP,
        start_station_name  TEXT,
        start_station_id    TEXT,
        end_station_name    TEXT,
        end_station_id      TEXT,
        member_casual       TEXT
    );
  ")
  
  # Drop if already exists, then create fresh table
  DBI::dbExecute(con, glue("DROP TABLE IF EXISTS divvy.{month_name};"))
  DBI::dbExecute(con, create_sql)
  
  # Write corresponding R dataframe into PostgreSQL
  dbWriteTable(
    conn      = con,
    name      = DBI::Id(schema = "divvy", table = month_name),
    value     = divvy_data[[month_name]],
    overwrite = TRUE,   # ensures table is replaced if already exists
    row.names = FALSE
  )
  
}

Database views

SELECT COUNT(*) FROM divvy.january;
1 records
count
144873
SELECT COUNT(*) FROM divvy.february;
1 records
count
223164
SELECT COUNT(*) FROM divvy.june;
1 records
count
710721
SELECT COUNT(*) FROM divvy.november;
1 records
count
335075
SELECT COUNT(*) FROM divvy.december;
1 records
count
178372

Database functions

Exploratory Data Analysis

Conclusion